import datetime
import pandas as pd
import glob
import matplotlib.pyplot as plt
import plotly.express as px
import numpy as np
from sklearn.preprocessing import normalize
from sqlalchemy import create_engine
from helper_functions import load_data, get_data4uid
If you didn't load the raw data from Tankerkoenig, you can start here by loading the data from the SQLite database. In this section I will take a look at some interesting aspects of the data.
# run this cell to load data from sql file
# load_data returns two DataFrames: the price history and the station
# master data (see helper_functions.load_data)
prices_pd, stations_pd = load_data('./Data/prices_40.sql')
#prices_pd = prices_pd.set_index('date')
prices_pd.info()
# make uuid as category
# the station uuid repeats for every price row, so a categorical dtype
# saves memory and speeds up the filtering/grouping done below
prices_pd.station_uuid = prices_pd.station_uuid.astype('category')
prices_pd.info()
# quick check whether any NaN values are present in the price data
prices_pd.isnull().values.any()
So we now have all prices and stations for a closer look at how the prices rise and fall. As we can see from the cell above, there are NaN values inside our data. I will leave them here and drop them later for the specific data where necessary.
# how does the data look like for one station
prices_pd[prices_pd['station_uuid'] == '005056ba-7cb6-1ed2-bceb-82ea369c0d2d']
# get data for one station and type "diesel"
star_bahn = get_data4uid(prices_pd, '005056ba-7cb6-1ed2-bceb-82ea369c0d2d', 'diesel')
star_bahn.info()
star_bahn.head()
#let's have a look at the data
star_bahn.plot(figsize=(15,8));
# get samples per day
# downsample to one mean value per calendar day; days without any
# recorded price become NaN rows
star_day = star_bahn.resample('d').mean()
# now remove the np.nan values
star_day.dropna(inplace=True)
star_day.plot(figsize=(15,8));
# verify that no NaN values remain after the dropna above
star_day.isna().sum()
So we have no np.nan values in our dataset anymore, but there are a few days in 2018 without data. Possibly the station was closed during this time. I think this should not be a problem.
# get another station
total = get_data4uid(prices_pd, '79fb1f24-bebb-489e-841f-728f9053b555', 'diesel')
# daily mean prices for the second station
total_day = total.resample('d').mean()
# date of the highest daily mean diesel price
total_day.diesel.idxmax()
# plot both stations in one plot to see
plt.figure(figsize=(15,8));
plt.plot(star_day, label = 'Star');
plt.plot(total_day, label = 'Total');
plt.title('Diesel price, Star/Total')
plt.ylabel('price')
plt.legend();
The Total diesel price shows one crazy peak somewhere between July and August of 2017. Let's have a closer look...
# locate the exact date of the price peak within July/August 2017
total_day['2017-07':'2017-08'].diesel.idxmax()
# daily mean prices around the peak
total_day['2017-08-12':'2017-08-18'].diesel
# raw (unresampled) prices around the peak for a closer look
total['2017-08-13':'2017-08-17'].diesel
The data seems to be valid; it could be that some data points are missing and that is why the step is so big. On the 14th there are only two values, and on the 15th and 16th of August there is only one value each. It is a little bit strange, but it is possible.
# want to see the differences from day to day
# diff() gives the change of the daily mean price versus the previous day
star_day_diff = star_day.diff()
total_day_diff = total_day.diff()
plt.figure(figsize=(15,8))
plt.plot(star_day_diff)
plt.plot(total_day_diff)
plt.grid(axis='x')
plt.legend(['Star', 'Total']);
We can see the price differences between days, but now let's have a closer look at a shorter time frame and the corresponding prices. The two chosen stations are in the same location, around 500 m apart.
# one week of raw intraday prices for both stations
star_intraday = star_bahn['2017-05-01':'2017-05-07']
total_intraday = total['2017-05-01':'2017-05-07']
plt.figure(figsize=(15,8))
plt.plot(star_intraday)
plt.plot(total_intraday)
plt.grid(axis='x')
plt.legend(['Star', 'Total']);
In the plot above you can easily see who follows whom: Total raises the price first and Star follows. When reducing the price, Star is first and Total follows. Interesting :-)
How does it look during a single day? When is the price lowest and when is it highest, averaged over all days? For this I write a function to plot the data.
def plot_day4uuid(uuid, typ='diesel', name=None, save=None):
    """Plot the mean price per time of day for one station.

    Two curves are drawn: the hourly mean over the whole dataset and the
    hourly mean restricted to 2019.

    Input:
        uuid : id of station
        typ  : which type of gas
        name : Title for the plot
        save : filename to save
    """
    # one x-tick per hour, expressed in seconds since midnight
    tick_positions = 60 * 60 * np.arange(24)
    station = get_data4uid(prices_pd, uuid, typ)
    # drop implausible prices (< 0.50) by index label, then missing values
    implausible = station[station[typ] < 0.5].index
    station.drop(implausible, inplace=True)
    station.dropna(inplace=True)
    # resample to hourly values
    hourly = station.resample('H').mean().dropna()
    # mean price per time-of-day: all years vs. 2019 only
    by_time = hourly.groupby(hourly.index.time).mean()
    recent = hourly['2019-01-01':]
    recent_by_time = recent.groupby(recent.index.time).mean()
    plt.figure(figsize=(15, 8))
    plt.plot(by_time, label='Hourly Mean')
    plt.plot(recent_by_time, label='Hourly Mean 2019')
    plt.title(name if name else uuid)
    plt.legend()
    plt.xticks(tick_positions)
    if save:
        plt.savefig(save)
# hourly mean price curves for four stations in the area
plot_day4uuid('005056ba-7cb6-1ed2-bceb-82ea369c0d2d', 'diesel', 'Star', save='./star_hourly_mean.png')
plot_day4uuid('79fb1f24-bebb-489e-841f-728f9053b555', 'diesel', 'Total')
plot_day4uuid('51d4b59c-a095-1aa0-e100-80009459e03a', name='JET Koelner', save='./jetkoelner.png')
plot_day4uuid('005056ba-7cb6-1ed2-bceb-a46e32000d3e', name='Star Richrather', save='./starrichrather.png')
# summary statistics (mean, std, quartiles) of the raw Star prices
star_bahn.describe()
The examples above suggest that on average it is good to get fuel in the evening — depending on your preferred station, between 18 and 19 o'clock (Star) or 16 and 19 o'clock (Total). The standard deviation is 9 cents, so getting fuel at the right time can save money.
To answer this question I grouped the data by hour and weekday.
def plot_weekday4uuid(uuid, typ='diesel', name=None, save=None):
    """Plot mean hourly prices grouped by weekday for one station.

    Draws two stacked axes: the hourly mean per weekday over all years,
    and the same restricted to 2019.

    Input:
        uuid : id of station
        typ  : which type of gas
        name : Title for the plot
        save : filename to save
    """
    hourly_ticks = np.arange(25)
    data = get_data4uid(prices_pd, uuid, typ)
    # drop wrong data
    data.dropna(inplace=True)
    # resample to hourly values
    data = data.resample('H').mean().dropna()
    # groupby hour only for 2019 (copy so the column assignments below
    # do not touch the full frame)
    data_2019 = data['2019-01-01':].copy()
    data['day'] = data.index.weekday
    data['hour'] = data.index.hour
    data_2019['day'] = data_2019.index.weekday
    data_2019['hour'] = data_2019.index.hour
    fig, [ax, ax2] = plt.subplots(2, 1, figsize=(15, 12), constrained_layout=True)
    # mean price per (hour, weekday); unstack gives one line per weekday
    # BUG FIX: select the requested fuel type instead of hard-coded 'diesel'
    data.groupby(['hour', 'day']).mean()[typ].unstack().plot(ax=ax)
    data_2019.groupby(['hour', 'day']).mean()[typ].unstack().plot(ax=ax2)
    if name:
        ax.set_title(name)
        ax2.set_title(name + ' 2019')
    else:
        ax.set_title(uuid)
        # BUG FIX: was ax.set_title, which overwrote the title just set
        # above and left the 2019 axis untitled
        ax2.set_title(uuid + ' 2019')
    #ax.legend()
    ax.set_xticks(hourly_ticks)
    ax2.set_xticks(hourly_ticks)
    #ax.set_xlim(6,23)
    #ax2.set_xlim(6,23)
    fig.suptitle('Hourly prices grouped on weekdays', fontsize=16)
    if save:
        fig.savefig(save)
# hourly prices per weekday for the Star station, saved to file
plot_weekday4uuid('005056ba-7cb6-1ed2-bceb-82ea369c0d2d', 'diesel', 'Star', './hourly_based_weekdays.png')
Here we see a big difference: in the years before 2019 there was an obvious price low on Saturday and Sunday at 8 pm. In 2019 this is no longer noticeable on Sundays, but Saturday still seems to be the best time for the cheapest prices.
Now let's look at some statistics for all stations in this area and make a nice plot of them with Plotly.
# show the full price frame (notebook display)
prices_pd
# BUG FIX: the original `prices_pd.diesel.dropna(inplace=True)` had no
# effect — Series.dropna(inplace=True) on a column cannot shrink the
# parent DataFrame. Drop the rows with a missing diesel price from the
# DataFrame itself instead.
prices_pd.dropna(subset=['diesel'], inplace=True)
# group on station_uuid and have a look at "diesel"
grouped = prices_pd.groupby('station_uuid')[['diesel']]
grouped.describe()
# calculate per-station statistics of the diesel price and attach them
# to the station master data
var_price = grouped.var().rename(columns={"diesel": "var"})
min_price = grouped.min().rename(columns={"diesel": "min"})
std_price = grouped.std().rename(columns={"diesel": "std"})
# station master data indexed by uuid so the statistics join on it
short_pd = stations_pd.copy().set_index(['uuid'])
short_pd = short_pd.join(var_price).join(min_price).join(std_price)
# keep only stations for which all three statistics exist
short_pd = short_pd.dropna(subset=['var', 'min', 'std'])
short_pd.info()
# interactive map of all stations: marker size = price variance,
# marker color = minimum price
fig = px.scatter_mapbox(short_pd, lat="latitude", lon="longitude", hover_name="name", hover_data=["city", "post_code"],
color_discrete_sequence=["fuchsia"], zoom=10, width=700, height=700, size="var", color="min", title="Variance(size) and Min price")
# free OpenStreetMap tiles — no Mapbox access token required
fig.update_layout(mapbox_style="open-street-map")
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
fig.show()